# -*- coding: utf-8 -*-
'''
Created on Apr 11, 2012

@author: LONG HOANG GIANG
'''
import os, sys
sys.path.append(os.path.expanduser('/home5/vietcntt/longhoanggiang/python'))
sys.path.append(os.path.join(os.path.dirname(__file__), '../'))
from CrawlerLib import commonlib, Log, Http
import datetime
import re
import gzip
import simplejson as json
from urlparse import urljoin


def getChapter(name, url):
    '''
    Fetch one chapter page and collect the imgur image links embedded in it.

    name -- chapter title; stored unchanged under the 'chapter' key
    url  -- address of the chapter page, passed to Http.getXMLTree

    Returns a dict of the form {'chapter': name, 'images': [link, ...]} with
    the links in document order.
    '''
    # A parenthesised single-argument print produces the same output as the
    # bare print statement under Python 2.
    print('start getChapter: {0}'.format(name))
    tree = Http.getXMLTree(url)
    itemChapter = {'chapter': name, 'images': []}
    for item in tree.xpath("//textarea[@id='truyen18-eedit']/p/img"):
        link = item.get('src')
        # An <img> without a src attribute yields no usable link; skip it
        # rather than letting re.sub() fail on a non-string value.
        if not link: continue
        # Drop the "s" in front of ".jpg" (presumably imgur's small-thumbnail
        # suffix, so the full-size image is referenced -- TODO confirm).
        link = re.sub(r"s\.jpg", '.jpg', link)
        # Only images hosted on i.imgur.com are kept.
        if 'i.imgur.com' not in link: continue
        # One hard-coded image is always excluded (presumably a banner or
        # placeholder that is not part of the chapter -- verify).
        if link == 'http://i.imgur.com/ZEdvu.jpg': continue
        itemChapter['images'].append(link)
    print('finished chapter {0}'.format(name))
    return itemChapter

def process():
    '''
    Crawl every chapter linked from the story index page and store the result.

    Each anchor found in the first column of the "listing" table (the first
    table row is skipped) is resolved against the index URL and handed to
    getChapter(). The collected chapter dicts are serialised as JSON and
    written gzip-compressed to '/dungsihesman.gz'.
    '''
    url = 'http://www.truyen18.org/truyen/dung-si-hesman---nguoi-may-hesman---voltron/2490.html'
    tree = Http.getXMLTree(url)
    data = []
    for item in tree.xpath("//table[@class='listing']/tbody/tr[position()>1]/td[1]/a"):
        name = commonlib.stringify(item)
        link = item.get('href', '')
        # Anchors without a target cannot be crawled.
        if not link: continue
        # hrefs may be relative; resolve them against the index page.
        data.append(getChapter(name, urljoin(url, link)))
    # NOTE(review): the output lands in the filesystem root -- confirm this
    # path is intended and writable where the script runs.
    f = gzip.open('/dungsihesman.gz', 'wb')
    try:
        f.write(json.dumps(data))
    finally:
        # Close the archive even when serialising or writing fails, so the
        # file handle is not leaked.
        f.close()

if __name__ == '__main__':
    # Script entry point: run the crawl once, log the completion time, then
    # terminate the process immediately.
    logger = Log.getLogger("dungsihesman")
    process()
    logger.debug(">> Finished at {0}".format(datetime.datetime.now()))
    # os._exit() ends the process without flushing stdio buffers, so push out
    # any buffered progress output first (it matters when stdout is redirected
    # to a file or a pipe).
    sys.stdout.flush()
    sys.stderr.flush()
    # NOTE(review): status 1 conventionally signals failure even though the
    # run completed; kept unchanged in case a caller depends on it -- confirm.
    os._exit(1)